#### Replication: 
#### Why we should use the Gini coefficient to assess Punctuated Equilibrium Theory 
#### Constantin Kaplaner, GSI, LMU Munich
#### Yves Steinebach, GSI, LMU Munich
#### R Version used: 4.0.2 (2020-06-22)

# Start runtime ----
start.time <- Sys.time()
# Required Packages ----
library(tidyverse)  # version used: 1.3.0
library(DescTools)  # version used: 0.99.37
library(reshape2)   # version used: 1.4.4
library(gridExtra)  # version used: 2.3
library(Lmoments)   # version used: 1.3-1
library(stats)      # version used: 4.0.2
library(e1071)      # version used: 1.7-3
library(ggrepel)    # version used: 0.8.2
library(gglorenz)   # version used: 0.0.2

########## Main Text ####################
# 3. Sensitivity and precision of (L-)kurtosis and Gini coefficient: First simulation (p. 3)----

# Simulate Data
N <- 10000
set.seed(123)
simulated_values <- rt(N, df = 4)
simulated_values_df <- data.frame(simulated_values)

# Create Plot for Figure 1, reported on p. 3
quants <- simulated_values_df %>% 
  summarize(lower = quantile(simulated_values, probs = 0.005),
            upper = quantile(simulated_values, probs = .995))

Figure_1_1 <-  ggplot(simulated_values_df, aes(x = simulated_values)) +
  geom_density(fill = "grey",
               alpha = 0.8,
               color = "black") +
  geom_vline(aes(xintercept = quants$lower, linetype = ".005 & .995 Quantile")) + 
  geom_vline(aes(xintercept = quants$upper, linetype = ".005 & .995 Quantile")) +
  scale_linetype_manual(values = c(".005 & .995 Quantile" = "dashed"),
                        name = "") +
  xlab("Value") +
  ylab("Density") +
  theme_minimal() +
  theme(
    legend.title = element_blank(),
    legend.position = c(0.8, 0.8),
    legend.box.background = element_rect(colour = "grey")
  )

# Stepwise Estimation: Stepwise remove largest value (biggest outlier) and (re)estimate values

kurt_estimates_stepwise <- list()
for (i in 1:(N / 100)) {
  kurt_estimates_stepwise[[i]]  <- kurtosis(simulated_values[-order(-abs(simulated_values))[1:i]])
}

lkurt_estimates_stepwise <- list()
for (i in 1:(N / 100)) {
  lkurt_estimates_stepwise[[i]]  <- Lcoefs(simulated_values[-order(-abs(simulated_values))[1:i]])
  lkurt_estimates_stepwise[[i]] <- lkurt_estimates_stepwise[[i]][[4]] #the l-kurtosis is saved as the 4th value
}

gini_estimates_stepwise <- list()
for (i in 1:(N / 100)) {
  gini_estimates_stepwise[[i]]  <- Gini(abs(simulated_values[-order(-abs(simulated_values))[1:i]]), unbiased = F)
}

# Define normalize function
normalize <- function(x) {
  return ((x - min(x)) / (max(x) - min(x)))
}

# Create data frame
# Kurtosis
kurt_100 <- as.data.frame(unlist(kurt_estimates_stepwise))
kurt_100$removes <- order(-row_number(kurt_100))
kurt_100 <- rbind(kurt_100, c(kurtosis(simulated_values),0)) #Adds back the "0" value (all observations)
colnames(kurt_100) <- c("kurt", "removed")

# L-Kurtosis
lkurt_100 <- as.data.frame(unlist(lkurt_estimates_stepwise))
lkurt_100$removes <- order(-row_number(lkurt_100))
lkurt_100 <- rbind(lkurt_100, c(Lcoefs(simulated_values)[,4],0)) #Adds back the "0" value (all observations)
colnames(lkurt_100) <- c("lkurt", "removed")

# Gini
gini_100 <- as.data.frame(unlist(gini_estimates_stepwise))
gini_100$removes <- order(-row_number(gini_100))
gini_100 <- rbind(gini_100, c(Gini(abs(simulated_values)),0)) #Adds back the "0" value (all observations)
colnames(gini_100) <- c("gini", "removed")

# Normalize the values between 0 and 1 for comparability
lkurt_100$lkurt <- normalize(lkurt_100$lkurt)
kurt_100$kurt <- normalize(kurt_100$kurt)
gini_100$gini <- normalize(gini_100$gini)

# Combine into one data frame for plotting
stepwise_df <- cbind(gini_100, lkurt_100, kurt_100)
stepwise_df <- melt(stepwise_df, id="removed")


# Create plot for outliers, reported on p. 3
Figure_1_2 <- ggplot(stepwise_df, aes(x = removed, y = value, color = variable)) +
  geom_point(alpha = 0.7, size = 1) +
  geom_line() +
  xlab("Number of observations removed") +
  scale_color_manual(
    values = c("#3D5467", "#FF8966", "#DB5461"),
    labels = c(
      "Gini Coefficient (G)",
      expression(L - Kurtosis ~ (tau[4])),
      "Kurtosis (k)"
    )
  ) +
  ylab("Measure") +
  theme_minimal() +
  theme(
    legend.title = element_blank(),
    legend.position = c(0.7, 0.7),
    legend.box.background = element_rect(colour = "grey")
  )

# Figure 1, reported on p. 3
pdf("Figure1.pdf",width = 14 ,height=3)
grid.arrange(Figure_1_1, Figure_1_2, widths=c(0.5, 0.5)) 
dev.off()

rm(list=setdiff(ls(), c("Figure_1_1","Figure_1_2","start.time","quants")))

# 3. Sensitivity and precision of (L-)kurtosis and Gini coefficient: Second simulation (p. 3-4) ----

# Estimate Gini Coefficient and L-Kurtosis
# Based on 10.000 draws of the size 250 from t-distribution with df=4

N = 250
freedom = 4

#Estimate gini values
set.seed(123) # Seed is fixed for both simulation to generate same underlying values
gini_sim2 <- replicate(10000, Gini(abs(rt(N, df = freedom)), unbiased = F))

set.seed(123) # Seed is fixed for both simulation to generate same underlying values
lkurt_sim2 <- replicate(10000, Lcoefs(rt(N, df = freedom))[4])

# Create dataframe
sim2_df <- data.frame(gin = gini_sim2, lkurt = lkurt_sim2)

# Calculate correlation between values, reported on p.3
correllation_gini_lkurtosis_sim2 <- round(cor(sim2_df$gin, sim2_df$lkurt, method = "pearson"), 1)
correllation_gini_lkurtosis_sim2

# Standard deviation of the values, reported on p.4
sd_lkurtosis_sim2 <- round(sd(sim2_df$lkurt), 2)
sd_lkurtosis_sim2

sd_gini_sim2 <- round(sd(sim2_df$gin), 2)
sd_gini_sim2

# Coefficient of variation, reported on p.4
cv_lkurtosis_sim2 <- round(sd(sim2_df$lkurt) / mean(sim2_df$lkurt), 2)
cv_lkurtosis_sim2

cv_gini_sim2 <- round(sd(sim2_df$gin) / mean(sim2_df$gin), 2)
cv_gini_sim2

# Sample size = 500 to reach SD = 0.02, reported on p.4
set.seed(123) # Seed is fixed for both simulation to generate same underlying values 
sd_lkurtosis_500_sim2 <- round(sd(replicate(10000, Lcoefs(rt(500,4))[4])),2)
sd_lkurtosis_500_sim2

# Sample size = 1850 to reach CV = 0.05, reported on p. 4
set.seed(123) # Seed is fixed for both simulation to generate same underlying values 
lkurt2 <- replicate(10000, Lcoefs(rt(n=1850,df=4))[4])
cv_lkurtosis_1850_sim2 <- round(sd(lkurt2) / mean(lkurt2),2) # Coefficient of Variation
cv_lkurtosis_1850_sim2

# Density plots for Figure 2, reported on p. 4
Figure_2_1 <- ggplot(sim2_df, aes(x=gin)) + geom_density(fill="black",alpha = 0.2) + 
  geom_vline(aes(xintercept=mean(gin), linetype ="Mean")) +
  geom_vline(aes(xintercept = 1.96*sd(gin) + mean(gin), linetype="95% CI")) +
  geom_vline(aes(xintercept = -1.96*sd(gin) + mean(gin), linetype="95% CI")) +
  scale_linetype_manual(values = c("Mean" = "solid", "95% CI" = "dashed"), name="")+
  xlim(0,1)+
  ylim(0,26)+
  theme_minimal()+
  theme(legend.title = element_blank(), legend.position=c(0.75,0.75), 
        legend.direction = "horizontal",
        legend.box.background = element_rect(colour = "grey"))+
  xlab("Gini Coefficient (G)")+
  ylab("Density")

Figure_2_2<- ggplot(sim2_df, aes(x=lkurt)) + geom_density(fill="black",alpha = 0.2) + 
  geom_vline(aes(xintercept=mean(lkurt), linetype ="Mean")) +
  geom_vline(aes(xintercept = 1.96*sd(lkurt) + mean(lkurt), linetype="95% CI")) +
  geom_vline(aes(xintercept = -1.96*sd(lkurt) + mean(lkurt), linetype="95% CI")) +
  scale_linetype_manual(values = c("Mean" = "solid","95% CI" = "dashed"), name="")+
  xlim(0,1)+
  ylim(0,26)+
  theme_minimal()+
  theme(legend.title = element_blank(), legend.position=c(0.75,0.75), 
        legend.direction = "horizontal",
        legend.box.background = element_rect(colour = "grey"))+
  xlab(expression("L-Kurtosis "(tau[4])))+
  ylab("Density")

# Figure 2, p. 4
pdf("Figure2.pdf",width = 14,height=3)
grid.arrange(arrangeGrob(Figure_2_1, Figure_2_2, ncol = 2)) # Second row with 2 plots in 2 different columns
dev.off()

#Clean workspace
rm(sim2_df, freedom, gini_sim2, lkurt_sim2, lkurt2, N)

# 4. Implications for Type I errors: Third simulation (p. 4-5) ----
#" True" value for L-Kurtosis for a standard normal, reported on p. 4
set.seed(111)
ltrue_sim3 <- replicate(10000, Lcoefs(rnorm(1000))[4]) 

true_value_lkurtosis_sim3 <- round(mean(ltrue_sim3),3)
true_value_lkurtosis_sim3

# "True" value for Gini for a standard normal, reported on p. 4
set.seed(111)
gtrue_sim3 <- replicate(10000, Gini(abs(rnorm(1000))))
true_value_gini_sim3 <- round(mean(gtrue_sim3),3)
true_value_gini_sim3

# Compare true value to values from different sample sizes
# Simulation for Gini coefficient
rejection_gini_sim3 <- list() #empty list to store values
for(i in 50:500) {
  
  n = i # Sample size
  t= i -49 #  Indexing
  set.seed(i) # Fix seed to sample number to guarantee comparability
  tg <- replicate(1000, Gini(abs(rnorm(n)), unbiased = T))
  rejection_gini_sim3[t] <- length(tg[tg > (mean(gtrue_sim3) + 0.05)]) # Count number of times the simulated value is higher than true value by 0.05
}

# Simulation for L-Kurtosis
rejection_lkurtosis_sim3 <- list()
for(i in 50:500) {
  n = i # Sample size
  t= i -49 # Indexing
  set.seed(i) # Fix seed to sample number to guarantee comparability
  tl <- replicate(1000, Lcoefs(rnorm(n))[4])
  rejection_lkurtosis_sim3[t] <- length(tl[tl > (mean(ltrue_sim3) + 0.05)]) # Count number of times the simulated value is higher than true value by 0.05
}

sim3_df<- do.call(rbind, Map(data.frame, Gini=rejection_gini_sim3, LKurtosis=rejection_lkurtosis_sim3))
sim3_df$nr <- 50:500 # Add sample size 

# Convert to long for plotting
sim3_df <- melt(sim3_df, id.vars = "nr") 

# Figure 3, p. 5
Figure_3 <- ggplot(sim3_df, aes(x=nr,y=value/1000 * 100,color=variable,fill=variable)) +  #divided by 1000 * 100 to get percent
  geom_point(alpha=0.3,size=0.8)+
  geom_smooth(method="loess")+
  ylab("Frequency of Rejection of H0 in %")+
  xlab("Sample Size")+
  scale_color_manual(values=c("#3D5467", "#FF8966"),labels=c("Gini Coefficient (G)",expression("L-Kurtosis "(tau[4]))))+
  scale_fill_manual(values=c("#3D5467", "#FF8966"),
                    labels=c("Gini Coefficient (G)",expression("L-Kurtosis "(tau[4]))))+
  theme_minimal()+
  theme(legend.title = element_blank())+
  xlim(50,500)

pdf("Figure3.pdf",width = 14,height=4)
Figure_3
dev.off()

# Clear workspace
rm(sim3_df, gtrue_sim3, ltrue_sim3, n, i, t, tg, tl, rejection_gini_sim3, rejection_lkurtosis_sim3)

#### Replication: Supplementary Material: 
#### Why we should use the Gini coefficient to assess Punctuated Equilibrium Theory 


########## Supplementary Material #######
# Supplementary Material: 2.1 Bootstrap on US Budget Outlays (p. 10-11) ----
# Load data from Fatke et al.,2019 // https://doi.org/10.7910/DVN/8DWAZW 
# NOTE: Please cite original source when using this data!
budget <- read.table(url("https://dataverse.harvard.edu/api/access/datafile/:persistentId?persistentId=doi:10.7910/DVN/8DWAZW/XUFVZT"), header = T)

# Calculate change
budget <- diff(budget$outlays)

# Calculate Gini and L-Kurtosis, reported in Supplementary Material p. 10
true_gini_budget <- round(Gini(abs(budget)),2)
true_gini_budget

true_lkurtosis_budget <- round(Lcoefs(budget)[4],2)
true_lkurtosis_budget

# Bootstrap
set.seed(123)
gini_boot_budget <- replicate(10000,Gini(abs(sample(budget,204,replace = T))))

set.seed(123)
lkurtosis_boot_budget <- replicate(10000,(Lcoefs(sample(budget,204,replace=T))[4]))

# Standard Deviation, reported in Supplementary Material p. 10
sd_gini_budget <- round(sd(gini_boot_budget),2)
sd_gini_budget

sd_lkurtosis_budget <- round(sd(lkurtosis_boot_budget),2)
sd_lkurtosis_budget

# CV, reported on p. 10
cv_lkurtosis_budget <- round(sd(lkurtosis_boot_budget) / mean(lkurtosis_boot_budget),2)
cv_lkurtosis_budget

cv_gini_budget <- round(sd(gini_boot_budget) / mean(gini_boot_budget),2)
cv_gini_budget

# Correlation, reported in Supplementary Material p. 10
correlation_gini_lkurtosis_budget <- round(cor(lkurtosis_boot_budget,gini_boot_budget, method ="pearson"),1)
correlation_gini_lkurtosis_budget

# Figure A1, reported in Supplementary Material p. 10
boot_df <- data.frame(Gini=gini_boot_budget,Lkurtosis=lkurtosis_boot_budget)
boot_df <- melt(boot_df)
boot_df$full <- ifelse(boot_df$variable == "Gini", true_gini_budget, true_lkurtosis_budget)

Figure_A1 <- ggplot(boot_df, aes(x=value,color=variable,fill=variable)) + 
  geom_density(alpha=0.5)+
  geom_vline(aes(xintercept=full, color=variable), linetype="solid")+
  scale_color_manual(values=c("#3D5467","#FF8966"), 
                     labels=c("Gini Coefficient (G)",expression("L-Kurtosis "(tau[4]))))+
  scale_fill_manual(values=c("#3D5467","#FF8966"), 
                    labels=c("Gini Coefficient (G)",expression("L-Kurtosis "(tau[4]))))+  
  theme_minimal()+
  theme(legend.position = "none")+
  facet_wrap(~variable)+
  theme(legend.title = element_blank())+
  xlab("Measure")+
  ylab("Density")+
  xlim(0,1)

pdf("FigureA1.pdf", width = 15,height=4)
Figure_A1
dev.off()

# Gini absolute values versus split
neg_budget <- budget[budget <= 0]
pos_budget <- budget[budget > 0]

# Gini of positive / negative values, reported in Supplementary Materialn p. 11
gini_negative_budget <- round(Gini(abs(neg_budget)),2)
gini_positive_budget <- round(Gini(abs(pos_budget)),2)

# Weighted average of split values, reported in Supplementary Material p. 11
gini_weighted_budget <- round(((Gini(abs(neg_budget)) * length(neg_budget)) + 
                                 (Gini(abs(pos_budget)) * length(pos_budget))) / (length(neg_budget)+length(pos_budget)),2)
gini_weighted_budget

rm(boot_df, budget, gini_boot_budget, lkurtosis_boot_budget, neg_budget, pos_budget)

# Supplementary Material: 2.2 Example for possible Type I error caused by imprecision (p. 11-13) ----

# Load replication of Lundgren et al. 2018 // https://doi.org/10.1007/s11558-017-9288-x
# NOTE: Please cite original source when using this data!
io_df<- read.csv(url("https://static-content.springer.com/esm/art%3A10.1007%2Fs11558-017-9288-x/MediaObjects/11558_2017_9288_MOESM5_ESM.csv"))
io_df <- subset(io_df, IO != "UNSC") # We exclude UNSC since Lundgren et al. did not include it in their anylsis

# Density distributions for Figure 2A
Figure_A2_1 <- ggplot(io_df, aes(x=diff,color=IO)) + geom_density()+
  theme_minimal()+
  xlab("Change size")+
  ylab("Density")+
  ggtitle("Density Distributions")+
  theme(plot.title = element_text(size = 11, color = "grey30", face = "bold",vjust=3))

# Calculate Gini coefficient and L-Kurtosis
values_io_df <- io_df %>% 
  group_by(IO) %>% 
  summarize(Gini=Gini(abs(diff),unbiased = T),
            LKurtosis=Lcoefs(diff)[4])

# Include friction values as calculated by Lundgren et al. (2018)
values_io_df$friction <- ifelse(values_io_df$IO == "UN", 11, NA)
values_io_df$friction <- ifelse(values_io_df$IO == "EU", 5, values_io_df$friction )
values_io_df$friction <- ifelse(values_io_df$IO == "AU", 10, values_io_df$friction)
values_io_df$friction <- ifelse(values_io_df$IO == "OAS", 8, values_io_df$friction)
values_io_df$friction <- ifelse(values_io_df$IO == "OIC", 13, values_io_df$friction)

# Lorenz Curve and Friction plot for Figure 2A
values_io_df <- melt(values_io_df, id.vars=c("IO","friction"))

Figure_A2_2 <- ggplot(io_df, aes(x=abs(diff), color=IO)) + 
  stat_lorenz()+
  theme_minimal()+
  xlab("Cumulative share of Years")+
  ylab("Cumulative share of Policy Change")+
  ggtitle("Lorenz Curve")+
  theme(plot.title = element_text(size = 11, color = "grey30", face = "bold",vjust=3))

Figure_A2_3 <- ggplot(values_io_df, aes(x=friction, y=value, label=IO, color=IO)) + 
  geom_point()+
  geom_label_repel()+
  facet_wrap(~variable, scales="free")+
  theme_minimal()+
  xlab("Level of Friction")+
  ylab("Measurement")+
  theme(legend.position = "none")+
  ggtitle(c("Measurements of Gini Coefficient and L-Kurtosis"))+
  theme(plot.title = element_text(size = 11, color = "grey30", face = "bold",vjust=3))

# Figure A2, supplementary material, reported on p. 12
pdf("FigurA2.pdf", width=10, height=12,)
grid.arrange(Figure_A2_1,Figure_A2_2,Figure_A2_3, ncol=1)
dev.off()

# Clear Workspace
rm(io_df, values_io_df)

# Supplementary Material: 3. L-Kurtosis and Gini: Type II errors (p. 13-14) ----
# "True" value for L-Kurtosis for a standard normal
set.seed(111)
ltrue_simA1 <- replicate(10000, Lcoefs(rnorm(1000))[4]) 
mean(ltrue_simA1)

# "True" value for Gini for a standard normal
set.seed(111)
gtrue_simA1 <- replicate(10000, Gini(abs(rnorm(1000))))
mean(gtrue_simA1)

# Simulation for Gini coefficient
reject_gini_simA1 <- list() #empty list to store values
for(i in 2:30) {
  
  n = 250 # Sample size
  t= i -1 #  Indexing
  set.seed(i+100) # Fix seed to sample number to guarantee comparability
  tg <- replicate(1000, Gini(abs(rt(n,i)), unbiased = T))
  reject_gini_simA1[t] <- length(tg[tg > mean(gtrue_simA1) + 0.05]) # Count number of times the simulated value is higher 
  # than true value by 0.05
}

# Simulation for L-Kurtosis
reject_lkurtosis_simA1 <- list()
for(i in 2:30) {
  n = 250 # Sample size
  t= i -1 # Indexing
  set.seed(i+100) # Fix seed to sample number to guarantee comparability
  tl <- replicate(1000, Lcoefs(rt(n,i))[4])
  reject_lkurtosis_simA1[t] <- length(tl[tl > mean(ltrue_simA1) + 0.05]) # Count number of times the simulated value is higher 
  # than true value by 0.05
}

# Combine into one dataframe
simA1_df <- do.call(rbind, Map(data.frame, Gini=reject_gini_simA1, LKurtosis=reject_lkurtosis_simA1)) # Convert to data frame
simA1_df$nr <- 2:30 # Add sample size 

# Convert to long for plotting
simA1_df <- melt(simA1_df, id.vars = "nr") 

#Figure A3, reported p. 14
Figure_A3 <- ggplot(simA1_df, aes(x=nr,y=value/1000 * 100,color=variable,fill=variable)) +  #divided by 1000 * 100 to get percent
  geom_point(alpha=0.3,size=0.5)+
  geom_smooth()+
  ylab("Frequency of Rejection of H0 in %")+
  xlab("Degrees of freedom of T-Distribution")+
  scale_color_manual(values=c("#3D5467", "#FF8966"),labels=c("Gini Coefficient (G)",expression("L-Kurtosis "(tau[4]))))+
  scale_fill_manual(values=c("#3D5467", "#FF8966"),
                    labels=c("Gini Coefficient (G)",expression("L-Kurtosis "(tau[4]))))+
  theme_minimal()+
  theme(legend.title = element_blank())

pdf("FigureA3.pdf",width = 15,height=5)
Figure_A3
dev.off()

rm(t,tg,tl,n,i, gtrue_simA1, ltrue_simA1, simA1_df, reject_gini_simA1, reject_lkurtosis_simA1)

# End runtime ----
end.time <- Sys.time()
time.taken <- end.time - start.time
time.taken
